{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import lxml.etree as le\n",
    "from pandas.core.frame import DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 目标网站\n",
    "url = 'https://www.runoob.com/html/html-tutorial.html'\n",
    "# XPath语句\n",
    "x1 = '//div[@id=\"leftcolumn\"]/a[@target=\"_top\"]/text()'\n",
    "x2 = '//div[@id=\"leftcolumn\"]/a[@target=\"_top\"]/@href'\n",
    "# 网站源码\n",
    "content = requests.get(url).content\n",
    "# 1 原生的lxml------------------------------------\n",
    "# 把HTML源码转成XML\n",
    "contentx = le.HTML(content)\n",
    "# 进行XPath数据提取\n",
    "xiaojie_texts = ktool.xpath.xpath_all(content,x1)\n",
    "xiaojie_hrefs = contentx.xpath(x2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "c={\"小节名称\" : xiaojie_texts,\n",
    "   \"链接地址\" : xiaojie_hrefs}\n",
    "data=DataFrame(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['链接地址'] = 'https://www.runoob.com' + data['链接地址']\n",
    "data['小节名称'] = data['小节名称'].str.replace(r'\\t','').str.replace(r'\\n','').str.replace(r'\\r','').str.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>小节名称</th>\n",
       "      <th>链接地址</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HTML 教程</td>\n",
       "      <td>https://www.runoob.com/html/html-tutorial.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HTML 简介</td>\n",
       "      <td>https://www.runoob.com/html/html-intro.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HTML 编辑器</td>\n",
       "      <td>https://www.runoob.com/html/html-editors.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HTML 基础</td>\n",
       "      <td>https://www.runoob.com/html/html-basic.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HTML 元素</td>\n",
       "      <td>https://www.runoob.com/html/html-elements.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>HTML 属性</td>\n",
       "      <td>https://www.runoob.com/html/html-attributes.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>HTML 标题</td>\n",
       "      <td>https://www.runoob.com/html/html-headings.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>HTML 段落</td>\n",
       "      <td>https://www.runoob.com/html/html-paragraphs.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>HTML 文本格式化</td>\n",
       "      <td>https://www.runoob.com/html/html-formatting.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>HTML 链接</td>\n",
       "      <td>https://www.runoob.com/html/html-links.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>HTML 头部</td>\n",
       "      <td>https://www.runoob.com/html/html-head.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>HTML CSS</td>\n",
       "      <td>https://www.runoob.com/html/html-css.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>HTML 图像</td>\n",
       "      <td>https://www.runoob.com/html/html-images.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>HTML 表格</td>\n",
       "      <td>https://www.runoob.com/html/html-tables.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>HTML 列表</td>\n",
       "      <td>https://www.runoob.com/html/html-lists.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>HTML 区块</td>\n",
       "      <td>https://www.runoob.com/html/html-blocks.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>HTML 布局</td>\n",
       "      <td>https://www.runoob.com/html/html-layouts.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>HTML 表单</td>\n",
       "      <td>https://www.runoob.com/html/html-forms.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>HTML 框架</td>\n",
       "      <td>https://www.runoob.com/html/html-iframes.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>HTML 颜色</td>\n",
       "      <td>https://www.runoob.com/html/html-colors.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>HTML 颜色名</td>\n",
       "      <td>https://www.runoob.com/html/html-colornames.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>HTML 颜色值</td>\n",
       "      <td>https://www.runoob.com/html/html-colorvalues.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>HTML 脚本</td>\n",
       "      <td>https://www.runoob.com/html/html-scripts.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>HTML 字符实体</td>\n",
       "      <td>https://www.runoob.com/html/html-entities.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>HTML URL</td>\n",
       "      <td>https://www.runoob.com/html/html-url.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>HTML 速查列表</td>\n",
       "      <td>https://www.runoob.com/html/html-quicklist.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>HTML 标签简写及全称</td>\n",
       "      <td>https://www.runoob.com/html/html-tag-name.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>HTML 总结</td>\n",
       "      <td>https://www.runoob.com/html/html-summary.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>XHTML 简介</td>\n",
       "      <td>https://www.runoob.com/html/html-xhtml.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>HTML5 教程</td>\n",
       "      <td>https://www.runoob.com/html/html5-intro.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>HTML5 Web SQL</td>\n",
       "      <td>https://www.runoob.comhtml5-web-sql.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>HTML5 应用程序缓存</td>\n",
       "      <td>https://www.runoob.com/html/html5-app-cache.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>HTML5 Web Workers</td>\n",
       "      <td>https://www.runoob.com/html/html5-webworkers.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>HTML5 SSE</td>\n",
       "      <td>https://www.runoob.com/html/html5-serversentev...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>HTML5 WebSocket</td>\n",
       "      <td>https://www.runoob.com/html/html5-websocket.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>HTML5 测验</td>\n",
       "      <td>https://www.runoob.com/quiz/html5-quiz.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>HTML(5) 代码规范</td>\n",
       "      <td>https://www.runoob.com/html/html5-syntax.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>HTML 媒体(Media)</td>\n",
       "      <td>https://www.runoob.com/html/html-media.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>HTML 插件</td>\n",
       "      <td>https://www.runoob.com/html/html-object.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>HTML 音频(Audio)</td>\n",
       "      <td>https://www.runoob.com/html/html-sounds.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>HTML 视频（Video）播放</td>\n",
       "      <td>https://www.runoob.com/html/html-videos.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>HTML 实例</td>\n",
       "      <td>https://www.runoob.com/html/html-examples.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>HTML 标签列表(字母排序)</td>\n",
       "      <td>https://www.runoob.com/tags/html-reference.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>HTML 标签列表（功能排序）</td>\n",
       "      <td>https://www.runoob.com/tags/ref-byfunc.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>HTML 属性</td>\n",
       "      <td>https://www.runoob.com/tags/ref-standardattrib...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>HTML 事件</td>\n",
       "      <td>https://www.runoob.com/tags/ref-eventattribute...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>HTML 画布</td>\n",
       "      <td>https://www.runoob.com/tags/ref-canvas.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>HTML 音频/视频</td>\n",
       "      <td>https://www.runoob.com/tags/ref-av-dom.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>HTML 有效DOCTYPES</td>\n",
       "      <td>https://www.runoob.com/tags/html-elementsdocty...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>HTML 颜色名</td>\n",
       "      <td>https://www.runoob.com/tags/html-colorname.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>HTML 拾色器</td>\n",
       "      <td>https://www.runoob.com/tags/html-colorpicker.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>HTML 字符集</td>\n",
       "      <td>https://www.runoob.com/charsets/html-charsets....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>HTML ASCII</td>\n",
       "      <td>https://www.runoob.com/tags/html-ascii.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>HTML ISO-8859-1</td>\n",
       "      <td>https://www.runoob.com/tags/ref-entities.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>HTML 符号</td>\n",
       "      <td>https://www.runoob.com/tags/html-symbols.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69</th>\n",
       "      <td>HTML URL 编码</td>\n",
       "      <td>https://www.runoob.com/tags/html-urlencode.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>HTML 语言代码</td>\n",
       "      <td>https://www.runoob.com/tags/html-language-code...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>HTTP 消息</td>\n",
       "      <td>https://www.runoob.com/tags/html-httpmessages....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>HTTP 方法</td>\n",
       "      <td>https://www.runoob.com/tags/html-httpmethods.html</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>键盘快捷键</td>\n",
       "      <td>https://www.runoob.com/tags/html-keyboardshort...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>74 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 小节名称                                               链接地址\n",
       "0             HTML 教程     https://www.runoob.com/html/html-tutorial.html\n",
       "1             HTML 简介        https://www.runoob.com/html/html-intro.html\n",
       "2            HTML 编辑器      https://www.runoob.com/html/html-editors.html\n",
       "3             HTML 基础        https://www.runoob.com/html/html-basic.html\n",
       "4             HTML 元素     https://www.runoob.com/html/html-elements.html\n",
       "5             HTML 属性   https://www.runoob.com/html/html-attributes.html\n",
       "6             HTML 标题     https://www.runoob.com/html/html-headings.html\n",
       "7             HTML 段落   https://www.runoob.com/html/html-paragraphs.html\n",
       "8          HTML 文本格式化   https://www.runoob.com/html/html-formatting.html\n",
       "9             HTML 链接        https://www.runoob.com/html/html-links.html\n",
       "10            HTML 头部         https://www.runoob.com/html/html-head.html\n",
       "11           HTML CSS          https://www.runoob.com/html/html-css.html\n",
       "12            HTML 图像       https://www.runoob.com/html/html-images.html\n",
       "13            HTML 表格       https://www.runoob.com/html/html-tables.html\n",
       "14            HTML 列表        https://www.runoob.com/html/html-lists.html\n",
       "15            HTML 区块       https://www.runoob.com/html/html-blocks.html\n",
       "16            HTML 布局      https://www.runoob.com/html/html-layouts.html\n",
       "17            HTML 表单        https://www.runoob.com/html/html-forms.html\n",
       "18            HTML 框架      https://www.runoob.com/html/html-iframes.html\n",
       "19            HTML 颜色       https://www.runoob.com/html/html-colors.html\n",
       "20           HTML 颜色名   https://www.runoob.com/html/html-colornames.html\n",
       "21           HTML 颜色值  https://www.runoob.com/html/html-colorvalues.html\n",
       "22            HTML 脚本      https://www.runoob.com/html/html-scripts.html\n",
       "23          HTML 字符实体     https://www.runoob.com/html/html-entities.html\n",
       "24           HTML URL          https://www.runoob.com/html/html-url.html\n",
       "25          HTML 速查列表    https://www.runoob.com/html/html-quicklist.html\n",
       "26       HTML 标签简写及全称     https://www.runoob.com/html/html-tag-name.html\n",
       "27            HTML 总结      https://www.runoob.com/html/html-summary.html\n",
       "28           XHTML 简介        https://www.runoob.com/html/html-xhtml.html\n",
       "29           HTML5 教程       https://www.runoob.com/html/html5-intro.html\n",
       "..                ...                                                ...\n",
       "44      HTML5 Web SQL           https://www.runoob.comhtml5-web-sql.html\n",
       "45       HTML5 应用程序缓存   https://www.runoob.com/html/html5-app-cache.html\n",
       "46  HTML5 Web Workers  https://www.runoob.com/html/html5-webworkers.html\n",
       "47          HTML5 SSE  https://www.runoob.com/html/html5-serversentev...\n",
       "48    HTML5 WebSocket   https://www.runoob.com/html/html5-websocket.html\n",
       "49           HTML5 测验        https://www.runoob.com/quiz/html5-quiz.html\n",
       "50       HTML(5) 代码规范      https://www.runoob.com/html/html5-syntax.html\n",
       "51     HTML 媒体(Media)        https://www.runoob.com/html/html-media.html\n",
       "52            HTML 插件       https://www.runoob.com/html/html-object.html\n",
       "53     HTML 音频(Audio)       https://www.runoob.com/html/html-sounds.html\n",
       "54   HTML 视频（Video）播放       https://www.runoob.com/html/html-videos.html\n",
       "55            HTML 实例     https://www.runoob.com/html/html-examples.html\n",
       "56    HTML 标签列表(字母排序)    https://www.runoob.com/tags/html-reference.html\n",
       "57    HTML 标签列表（功能排序）        https://www.runoob.com/tags/ref-byfunc.html\n",
       "58            HTML 属性  https://www.runoob.com/tags/ref-standardattrib...\n",
       "59            HTML 事件  https://www.runoob.com/tags/ref-eventattribute...\n",
       "60            HTML 画布        https://www.runoob.com/tags/ref-canvas.html\n",
       "61         HTML 音频/视频        https://www.runoob.com/tags/ref-av-dom.html\n",
       "62    HTML 有效DOCTYPES  https://www.runoob.com/tags/html-elementsdocty...\n",
       "63           HTML 颜色名    https://www.runoob.com/tags/html-colorname.html\n",
       "64           HTML 拾色器  https://www.runoob.com/tags/html-colorpicker.html\n",
       "65           HTML 字符集  https://www.runoob.com/charsets/html-charsets....\n",
       "66         HTML ASCII        https://www.runoob.com/tags/html-ascii.html\n",
       "67    HTML ISO-8859-1      https://www.runoob.com/tags/ref-entities.html\n",
       "68            HTML 符号      https://www.runoob.com/tags/html-symbols.html\n",
       "69        HTML URL 编码    https://www.runoob.com/tags/html-urlencode.html\n",
       "70          HTML 语言代码  https://www.runoob.com/tags/html-language-code...\n",
       "71            HTTP 消息  https://www.runoob.com/tags/html-httpmessages....\n",
       "72            HTTP 方法  https://www.runoob.com/tags/html-httpmethods.html\n",
       "73              键盘快捷键  https://www.runoob.com/tags/html-keyboardshort...\n",
       "\n",
       "[74 rows x 2 columns]"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_excel('7.张京瑶-python模块六作业.xlsx', index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
